######################################
### Performing Nationalism         ###
### American Sociological Review   ###
### [Replication Code]             ###
### Author: Lingxiao Chen          ###
### Date: March 2, 2026            ###
######################################

### Set-up 
## Clean the working environment and set up the working directory
setwd("/Users/Desktop/Data")#set your working directory here

## Load the required packages ------------------------
library(tidyverse)
library(knitr)
library(effectsize)

## Import the dataset ------------------------
celebrity_data <- read.csv("Performing_Nationalism_Replication_Data.csv",stringsAsFactors = TRUE)


# =============================================================================
# SECTION 1: Sample Demographics 
# =============================================================================
## Age: Clean year_born and calculate current age
celebrity_data <- celebrity_data %>% 
  mutate(year_born = as.numeric(year_born),
         year_born = ifelse(year_born < 1930, NA, year_born),#filter outliers 
         Age = 2024 - year_born,
         Age_z = as.numeric(scale(Age)))

summary(celebrity_data$Age)

## Gender: Set male as the reference category 
celebrity_data <- celebrity_data %>%
  mutate(gender = as.character(gender),
         gender = ifelse(gender == "其他" | is.na(gender), "Prefer Not to Answer", gender),
         gender = factor(gender, 
                         levels = c("男性", "女性", "Prefer Not to Answer"), 
                         labels = c("Male", "Female", "Prefer Not to Answer")),
         gender = relevel(gender, ref = "Male"))

## Region: Based on National Bureau of Statistics census categories
east      <- c("北京","天津","河北","上海","江苏","浙江","福建","山东","广东","海南")
central   <- c("山西","安徽","江西","河南","湖北","湖南")
west      <- c("内蒙古","广西","重庆","四川","贵州","云南","西藏","陕西","甘肃","青海","宁夏","新疆")
northeast <- c("辽宁","吉林","黑龙江")

celebrity_data <- celebrity_data %>%
  mutate(region = str_squish(as.character(region))) %>%
  mutate(region4 = case_when(
    region %in% east ~ "East",
    region %in% central ~ "Central",
    region %in% west ~ "West",
    region %in% northeast ~ "Northeast",
    TRUE ~ NA_character_
  ),
  region4 = factor(region4, levels = c("East","Central","West","Northeast")))

## Hukou status
hukou_map <- c(
  "城镇户籍（非农户籍）" = "urban",
  "农业户籍"             = "rural",
  "其他（请注明）"       = NA
)

celebrity_data <- celebrity_data %>%
  mutate(
    hukou       = hukou_map[str_trim(as.character(hukou))],
    hukou_urban = as.numeric(hukou == "urban")  # 1 = urban, 0 = rural
  )

table(celebrity_data$hukou_urban, useNA = "always")

## Education 
celebrity_data <- celebrity_data %>%
  mutate(edu_raw = as.character(edu) %>% str_squish())
edu_levels_cn  <- c("小学及以下","初中","高中","大专","大学本科","硕士","博士")
edu_labels_en  <- c("Elementary or below","Middle School","High School",
                    "Associate Degree","Bachelor's","Master's","PhD")

celebrity_data <- celebrity_data %>%
  mutate(
    edu_ord       = factor(str_squish(as.character(edu)),
                           levels  = edu_levels_cn,
                           labels  = edu_labels_en,
                           ordered = TRUE),
    edu_numeric   = as.integer(edu_ord),          # ordinal: 1 (lowest) – 7 (highest)
    edu_numeric_z = as.numeric(scale(edu_numeric)),
    edu_3         = case_when(                     # collapsed 3-category version
      edu_numeric <= 2 ~ "Lower (Elementary–Middle)",
      edu_numeric == 3 ~ "High School",
      edu_numeric >= 4 ~ "Associate+",
      TRUE             ~ NA_character_
    ),
    edu_3 = factor(edu_3,
                   levels = c("Lower (Elementary–Middle)", "High School", "Associate+"))
  )

## Income 
# Parse midpoints from raw Chinese income-range strings, then derive a 3-level bracket and a log-standardised continuous measure for analyses.
celebrity_data <- celebrity_data %>%
  mutate(
    income_raw = as.character(income) %>%
      str_squish() %>%
      str_replace_all("[，,]",  "") %>%   # remove full/half-width commas
      str_replace_all("元",     "") %>%   # drop currency character
      str_replace_all("[–—－]", "-") %>%  # normalise dashes to hyphen
      str_replace_all("\\s",    ""),      # remove remaining whitespace
    # Extract lower and upper bounds, assign midpoint
    topcoded   = str_detect(income_raw, "及以上|\\+$"),
    nums       = str_extract_all(income_raw, "\\d+"),
    low        = as.numeric(sapply(nums, function(x) x[1])),
    high       = as.numeric(sapply(nums, function(x) x[2])),
    income_mid = case_when(
      topcoded                    ~ 110000,           # top-coded: assigned 110k
      !is.na(low) & !is.na(high) ~ (low + high) / 2, # interval midpoint
      TRUE                        ~ NA_real_
    ),
    # Collapse into 3-level bracket (used in analyses)
    income_bracket = case_when(
      income_mid <= 40000                          ~ "Lower Income (<40k)",
      income_mid > 40000 & income_mid <= 100000   ~ "Upper Middle Income (40k-100k)",
      income_mid > 100000                         ~ "High Income (>100k)",
      TRUE                                         ~ NA_character_
    ),
    income_bracket = factor(income_bracket,
                            levels = c("Lower Income (<40k)",
                                       "Upper Middle Income (40k-100k)",
                                       "High Income (>100k)")),
    # Log-standardised continuous measure (used in OLS models)
    income_log   = ifelse(income_mid > 0, log(income_mid), NA_real_),
    income_log_z = as.numeric(scale(income_log))
  ) %>%
  select(-income_raw, -nums, -low, -high, -topcoded)
    
    
## Party Membership 
missing_party_status <- sum(is.na(celebrity_data$party_status))
unique(celebrity_data$party_status)
celebrity_data <- celebrity_data %>%
  mutate(party_status = case_when(
    party_status == "是" ~ "member",
    party_status == "否" ~ "non-member",
    TRUE ~ NA_character_  # Anything else becomes NA
  )) %>%
  mutate(party_status_member = as.numeric(party_status == "member"))  # Convert to 1 (member) / 0 (non-member)

## Political knowledge index, correct answer as of Fall 2024 
table(celebrity_data$us_senate, useNA = "ifany")
table(celebrity_data$us_rep, useNA = "ifany")
table(celebrity_data$us_supreme_court, useNA = "ifany")
table(celebrity_data$us_first_amend, useNA = "ifany")
table(celebrity_data$japan_pm, useNA = "ifany")
table(celebrity_data$british_pm, useNA = "ifany")
table(celebrity_data$ccp_standing_comm, useNA = "ifany")
levels(celebrity_data$us_senate)

#Create correctness variables 
# U.S. Senate majority (Correct in Sept 2024 = 民主党)
celebrity_data$senate_correct <- ifelse(
  celebrity_data$us_senate == "民主党", 1, 0
)

# U.S. House majority (Correct = 共和党)
celebrity_data$house_correct <- ifelse(
  celebrity_data$us_rep == "共和党", 1, 0
)

# U.S. Supreme Court tenure (Correct = 终身任职)
celebrity_data$supreme_correct <- ifelse(
  celebrity_data$us_supreme_court == "终身任职", 1, 0
)

# First Amendment (Correct = 宗教信仰自由)
celebrity_data$firstamend_correct <- ifelse(
  celebrity_data$us_first_amend == "宗教信仰自由", 1, 0
)

# Japan PM (Correct = 岸田文雄)
celebrity_data$japanpm_correct <- ifelse(
  celebrity_data$japan_pm == "岸田文雄", 1, 0
)

# UK PM (Correct = 基尔·斯塔默)
celebrity_data$ukpm_correct <- ifelse(
  celebrity_data$british_pm == "基尔·斯塔默", 1, 0
)

# NOT in the Politburo Standing Committee (Correct = 栗战书)
celebrity_data$pscc_correct <- ifelse(
  celebrity_data$ccp_standing_comm == "栗战书", 1, 0
)

celebrity_data$pol_knowledge_sum <-
  celebrity_data$senate_correct +
  celebrity_data$house_correct +
  celebrity_data$supreme_correct +
  celebrity_data$firstamend_correct +
  celebrity_data$japanpm_correct +
  celebrity_data$ukpm_correct +
  celebrity_data$pscc_correct

table(celebrity_data$pol_knowledge_sum)
summary(celebrity_data$pol_knowledge_sum)


## Entertainment spending and entertainment news exposure
celebrity_data <- celebrity_data %>% 
  mutate(
    entertain_news = str_trim(as.character(entertain_news)),
    entertain_spending = str_trim(as.character(entertain_spending)),
    
# Recode to *numbers*, stored into new *_n variables
    entertain_news_n = case_when(
      entertain_news == "完全没看过"   ~ 0,
      entertain_news == "一个月1～2次" ~ 1,
      entertain_news == "一周1～2次"   ~ 2,
      entertain_news == "一周3～5次"   ~ 3,
      entertain_news == "一周5次以上" ~ 4,
      TRUE                            ~ NA_real_
    ),
    
    entertain_spending_n = case_when(
      entertain_spending == "0元"            ~ 0,
      entertain_spending == "1元～100元"     ~ 1,
      entertain_spending == "101元～500元"   ~ 2,
      entertain_spending == "501元～1000元"  ~ 3,
      entertain_spending == "1000元以上"     ~ 4,
      TRUE                                   ~ NA_real_
    ),
# Ordered factors 
entertain_news_f = factor(
  entertain_news_n,
  levels = 0:4,
  labels = c("Never", "Monthly", "Weekly", "3-5x/Week", "5+ Times/Week"),
  ordered = TRUE
),

entertain_spending_f = factor(
  entertain_spending_n,
  levels = 0:4,
  labels = c("0 RMB", "1-100 RMB", "101-500 RMB", "501-1000 RMB", "1000+ RMB"),
  ordered = TRUE
)
  )

# Numeric
celebrity_data <- celebrity_data %>%
  mutate(
    entertain_news_n_z = scale(entertain_news_n)[, 1],
    entertain_spending_n_z = scale(entertain_spending_n)[, 1]
  )

# Spearman correlation between news exposure and entertainment spending
cor(celebrity_data$entertain_news_n, celebrity_data$entertain_spending_n,
    use = "complete.obs", method = "spearman")


# =============================================================================
# SECTION 2: Transgressions Severity Rating and Ranking
# =============================================================================

# Rename and recode severity items 
celebrity_data <- celebrity_data %>% 
  rename(
    mis_fashioncollab = misconduct_scale_1,
         mis_taiwan = misconduct_scale_2,
         mis_japanshrine = misconduct_scale_3,
         mis_cheat = misconduct_scale_4,
         mis_drunkdrive = misconduct_scale_5,
         mis_drug = misconduct_scale_6,
         mis_prostitution = misconduct_scale_7,
         mis_tax = misconduct_scale_8
    ) %>% 
  mutate(across(matches("^mis_"), ~ dplyr::recode(as.character(.x),
                                                  "完全不严重" = 0,
                                                  "不太严重" = 1,
                                                  "比较严重" = 2,
                                                  "非常严重" = 3,
                                                  "极其严重" = 4,
                                                  "不知道/不想回答" = NA_real_
  )))

# Verify the recoding
celebrity_data %>% 
  dplyr::select(matches("^mis_")) %>% 
  head()

#Look at the distribution of the severity rating of the transgressions 
mis_counts_all_levels <- lapply(celebrity_data %>% 
                                  dplyr::select(matches("^mis_")),
                                table, useNA = "ifany")
print(mis_counts_all_levels)

# ── Paired t-test: Political vs. Nonpolitical severity ratings ────────────────
# Pivot to long, assign category, then average per respondent per category

celebrity_data <- celebrity_data %>%
  mutate(respondent_id = row_number())

celebrity_data_wide <- celebrity_data %>%
  pivot_longer(
    cols      = c(mis_japanshrine, mis_fashioncollab, mis_taiwan,
                  mis_drug, mis_tax, mis_drunkdrive, mis_prostitution, mis_cheat),
    names_to  = "misconduct_type",
    values_to = "severity_score"
  ) %>%
  mutate(
    misconduct_category = case_when(
      misconduct_type %in% c("mis_japanshrine", "mis_fashioncollab", "mis_taiwan")
      ~ "Political Misconduct",
      misconduct_type %in% c("mis_drug", "mis_tax", "mis_drunkdrive",
                             "mis_prostitution", "mis_cheat")
      ~ "Nonpolitical Misconduct"
    )
  ) %>%
  group_by(respondent_id, misconduct_category) %>%
  summarise(mean_severity = mean(severity_score, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = misconduct_category, values_from = mean_severity)

# Descriptive statistics
celebrity_data_wide %>%
  summarise(across(c(`Political Misconduct`, `Nonpolitical Misconduct`),
                   list(mean = ~ mean(.x, na.rm = TRUE),
                        sd   = ~   sd(.x, na.rm = TRUE))))

# Paired t-test: political misconducts rated significantly more severe
t.test(celebrity_data_wide$`Political Misconduct`,
       celebrity_data_wide$`Nonpolitical Misconduct`,
       paired = TRUE)

# ── Relative severity ranking (Figure 1) ─────────────────────────────────────
# Respondents rank-ordered all 8 transgressions; rank is reversed so that higher values = more severe (original: 1 = most severe → reversed: 8 = most severe)


misconduct_labels <- c(
  "1" = "Fashion Collaboration",
  "2" = "Taiwan Statement",
  "3" = "Japan Shrine Visit",
  "4" = "Extramarital Affair",
  "5" = "Drunk Driving",
  "6" = "Drug Abuse",
  "7" = "Prostitution",
  "8" = "Tax Evasion"
)

celebrity_data_long_relative <- celebrity_data %>%
  pivot_longer(
    cols      = starts_with("misconduct_ranking_"),
    names_to  = "rank_position",
    values_to = "misconduct_code"
  ) %>%
  filter(!is.na(misconduct_code)) %>%
  mutate(
    rank_numeric       = as.numeric(str_extract(rank_position, "\\d+")),
    reversed_rank      = 9 - rank_numeric,   # higher = more severe
    misconduct_label   = recode(as.character(misconduct_code), !!!misconduct_labels),
    misconduct_category = case_when(
      misconduct_label %in% c("Japan Shrine Visit", "Fashion Collaboration", "Taiwan Statement")
      ~ "Nationalist Transgressions",
      misconduct_label %in% c("Drug Abuse", "Tax Evasion", "Drunk Driving",
                              "Prostitution", "Extramarital Affair")
      ~ "Non-political Transgressions"
    )
  )

# Mean reversed rank per transgression (for plot)
misconduct_rank_summary <- celebrity_data_long_relative %>%
  group_by(misconduct_label, misconduct_category) %>%
  summarise(mean_rank = mean(reversed_rank, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(mean_rank))

print(misconduct_rank_summary)

# ── Figure 1 ──────────────────────────────────────────────────────────────────
cb_cols <- c("Nationalist Transgressions"    = "#298c8c", #CVD friendly
             "Non-political Transgressions"  = "#b8b8b8")

ggplot(misconduct_rank_summary,
       aes(x = reorder(misconduct_label, mean_rank),
           y = mean_rank,
           fill = misconduct_category)) +
  geom_col(width = 0.55, colour = "black", linewidth = 0.3) +
  geom_text(aes(label = sprintf("%.2f", mean_rank)),
            hjust = -0.15, colour = "black", size = 3.6) +
  scale_fill_manual(values = cb_cols) +
  scale_y_continuous(limits = c(0, 6.2), expand = c(0, 0)) +
  labs(
    title   = "Mean Relative Severity Rank of Celebrity Transgressions",
    x       = NULL,
    y       = "Mean Relative Rank (1 = least severe, 8 = most severe)",
    fill    = NULL,
    caption = "Ranks reversed so that 8 = most severe."
  ) +
  coord_flip() +
  theme_minimal(base_size = 12) +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(colour = "grey80"),
    legend.position    = "bottom",
    plot.title         = element_text(face = "bold"),
    plot.margin        = margin(t = 10, r = 20, b = 10, l = 10, unit = "pt")
  )

ggsave("figure1_severityrank.tiff",
       width = 7, height = 6, units = "in",
       dpi = 300, compression = "lzw")


# =============================================================================
# SECTION 3: Regression analayses 
# =============================================================================
library(broom)
library(sandwich)
library(lmtest)
library(modelsummary)

# Build person-level analysis dataset ───────────────────────────────────────
# Nationalism Salience Index = mean(political ranks) - mean(nonpolitical ranks)
# Higher values = respondent rates nationalist transgressions as relatively more severe


analysis_data <- celebrity_data_long_relative %>%
  group_by(respondent_id, misconduct_category) %>%
  summarise(mean_rank = mean(reversed_rank, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = misconduct_category, values_from = mean_rank) %>%
  mutate(nationalism_salience = `Nationalist Transgressions` - `Non-political Transgressions`) %>%
  left_join(
    celebrity_data %>%
      dplyr::select(respondent_id,
                    Age_z, gender, Age,region4,            
                    income_log_z, income_bracket,
                    edu_numeric_z, edu_3,
                    hukou_urban, party_status_member,
                    entertain_news_n_z, entertain_spending_n_z,
                    pol_knowledge_sum),
    by = "respondent_id"
  ) %>%
  filter(!is.na(nationalism_salience)) %>% 
  mutate(
    gender        = factor(gender, levels = c("Male","Female")),
    region4       = factor(region4, levels = c("East","Central","West","Northeast")),
    income_bracket = factor(income_bracket,
                            levels = c("Lower Income (<40k)",
                                       "Upper Middle Income (40k-100k)",
                                       "High Income (>100k)")),
    edu_3         = factor(edu_3,
                           levels = c("Lower (Elementary–Middle)","High School","Associate+"))
  )

# ── OLS Models ────────────────────────────────────────────────────────────────
# Model 1: continuous SES (income and education as gradients)
mod1 <- lm(nationalism_salience ~
             Age_z + gender + region4 +
             income_log_z + edu_numeric_z +
             hukou_urban + party_status_member +
             entertain_news_n_z + entertain_spending_n_z +
             pol_knowledge_sum,
           data = analysis_data, na.action = na.exclude)

# Model 2: categorical SES (income and education as discrete brackets)
mod2 <- lm(nationalism_salience ~
             Age_z + gender + region4 +
             income_bracket + edu_3 +
             hukou_urban + party_status_member +
             entertain_news_n_z + entertain_spending_n_z +
             pol_knowledge_sum,
           data = analysis_data, na.action = na.exclude)

summary(mod1)
summary(mod2)


# ── Figure 2: Coefficient plot (Model 1, standardised β) ─────────────────────
# Display order and labels for the coefficient plot
desired_order <- c(
  "Age_z", "genderFemale",
  "region4Central", "region4West", "region4Northeast",
  "hukou_urban", "edu_numeric_z", "income_log_z",
  "party_status_member", "pol_knowledge_sum",
  "entertain_spending_n_z", "entertain_news_n_z"
)

nice_labels <- c(
  "Age_z"                  = "Age (z)",
  "genderFemale"           = "Female (vs Male)",
  "region4Central"         = "Central (vs East)",
  "region4West"            = "West (vs East)",
  "region4Northeast"       = "Northeast (vs East)",
  "hukou_urban"            = "Urban hukou (vs rural)",
  "edu_numeric_z"          = "Education (z)",
  "income_log_z"           = "Income (log, z)",
  "party_status_member"    = "CCP member",
  "pol_knowledge_sum"      = "Political knowledge (0–7)",
  "entertain_spending_n_z" = "Entertainment spending (z)",
  "entertain_news_n_z"     = "Entertainment news exposure (z)"
)

plot_data <- standardize_parameters(mod1, method = "refit") %>%
  as.data.frame() %>%
  filter(Parameter %in% desired_order) %>%
  transmute(
    term       = factor(Parameter, levels = desired_order),
    estimate   = Std_Coefficient,
    conf.low   = CI_low,
    conf.high  = CI_high
  ) %>%
  arrange(term) %>%
  mutate(
    term_label = fct_rev(fct_inorder(recode(term, !!!nice_labels)))
  )

fig2 <- ggplot(plot_data, aes(x = estimate, y = term_label)) +
  geom_vline(xintercept = 0, linetype = "dashed", colour = "gray50", linewidth = 0.5) +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high),
                 height = 0.25, linewidth = 0.7, colour = "gray30") +
  geom_point(size = 3, colour = "black") +
  # Filled point highlights statistically significant coefficients
  geom_point(data = ~ filter(., conf.low > 0 | conf.high < 0),
             size = 3, shape = 16, colour = "black") +
  labs(x = "Standardized OLS Estimate (β)", y = NULL) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.y        = element_text(size = 11.5, colour = "black"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    plot.margin        = margin(t = 10, r = 20, b = 10, l = 10, unit = "pt")
  )

fig2
ggsave("figure2_coefplot.tiff", fig2, width = 7, height = 5,
       units = "in", dpi = 300, compression = "lzw") 

# ── Appendix Tables: Regression output (Models 1 & 2) ────────────────────────
coef_labels <- c(
  "(Intercept)"                                  = "Intercept",
  "Age_z"                                        = "Age (z)",
  "genderFemale"                                 = "Female (vs Male)",
  "region4Central"                               = "Central (vs East)",
  "region4West"                                  = "West (vs East)",
  "region4Northeast"                             = "Northeast (vs East)",
  "income_log_z"                                 = "Income (log, z)",
  "hukou_urban"                                  = "Urban hukou (vs rural)",
  "party_status_member"                          = "CCP member",
  "edu_numeric_z"                                = "Education (z)",
  "entertain_news_n_z"                           = "Entertainment news exposure (z)",
  "entertain_spending_n_z"                       = "Entertainment spending (z)",
  "pol_knowledge_sum"                            = "Political knowledge (0–7)",
  "income_bracketUpper Middle Income (40k-100k)" = "Upper-middle income (40k–100k)",
  "income_bracketHigh Income (>100k)"            = "High income (>100k)",
  "edu_3High School"                             = "High school (vs low)",
  "edu_3Associate+"                              = "Associate+ (vs low)"
)

gof_map <- data.frame(
  raw   = c("nobs", "r.squared", "adj.r.squared", "rmse"),
  clean = c("N", "R²", "Adj. R²", "RMSE"),
  fmt   = 2
)

# Export combined table to Word (Appendix D)
modelsummary(
  list("Model 1: Continuous SES" = mod1,
       "Model 2: Categorical SES" = mod2),
  coef_map = coef_labels,
  gof_map  = gof_map,
  stars    = TRUE,
  output   = "appendix_regressions.docx"
)

# =============================================================================
# SECTION 4: Subgroup and Weighted Analyses (Online Appendix, Part E)
# =============================================================================
library(scales)
library(survey)

# ── Weighted sample: respondents aged 18–59 ───────────────────────────────────
# One respondent aged 60 is excluded; post-stratification targets are for 18–59
weighted_data <- celebrity_data %>%
  filter(Age >= 18, Age <= 59) %>%
  mutate(
    age_cat = cut(Age,
                  breaks = c(18, 20, 30, 40, 50, 60),
                  labels = c("18-19", "20-29", "30-39", "40-49", "50-59"),
                  right  = FALSE),
    edu_cat = edu_3   # reuse collapsed education factor from Section 1
  )

# ── Post-stratification targets (Chinese census, 18–59 population) ────────────
N <- nrow(weighted_data)

# Age targets: renormalised to 18–59 from national census proportions
age_pop <- data.frame(
  age_cat = c("18-19", "20-29", "30-39", "40-49", "50-59"),
  Freq    = c(0.0334,   0.1967,  0.2632,  0.2443,  0.2624) * N
)

# Education targets: census figures for population aged 15+
edu_pop <- data.frame(
  edu_cat = c("Lower (Elementary–Middle)", "High School", "Associate+"),
  Freq    = c(0.6301,                       0.1821,        0.1878) * N
)

# ── Rake to age and education margins ────────────────────────────────────────
des0 <- svydesign(ids = ~1, data = weighted_data, weights = ~1)

des_raked <- rake(
  design             = des0,
  sample.margins     = list(~age_cat, ~edu_cat),
  population.margins = list(age_pop, edu_pop)
)

# Verify raking converged to targets
svytable(~age_cat, des_raked) / N
svytable(~edu_cat, des_raked) / N
summary(weights(des_raked))

# ── Extract weights for downstream use ───────────────────────────────────────
weights_df <- data.frame(
  respondent_id = weighted_data$respondent_id,
  weight        = weights(des_raked)
)

# =============================================================================
# PART A: Subgroup Analysis by Education (College vs. Non-college)
# =============================================================================

celebrity_data <- celebrity_data %>%
  mutate(
    edu_college = factor(ifelse(edu_numeric >= 4, "College+", "Non-college"),
                         levels = c("Non-college", "College+"))
  )

# Mean absolute severity per respondent per category, by education subgroup
abs_subgroups <- celebrity_data %>%
  select(respondent_id, edu_college,
         mis_japanshrine, mis_fashioncollab, mis_taiwan,
         mis_drug, mis_tax, mis_drunkdrive, mis_prostitution, mis_cheat) %>%
  pivot_longer(cols = starts_with("mis_"), names_to = "mis", values_to = "sev") %>%
  mutate(
    category = if_else(
      mis %in% c("mis_japanshrine", "mis_fashioncollab", "mis_taiwan"),
      "Nationalist", "Nonpolitical"
    )
  ) %>%
  group_by(respondent_id, edu_college, category) %>%
  summarise(person_mean = mean(sev, na.rm = TRUE), .groups = "drop") %>%
  group_by(edu_college, category) %>%
  summarise(
    mean_sev = mean(person_mean, na.rm = TRUE),
    se       = sd(person_mean, na.rm = TRUE) / sqrt(sum(!is.na(person_mean))),
    n        = sum(!is.na(person_mean)),
    .groups  = "drop"
  )

print(abs_subgroups)

# Nationalism salience by education subgroup
rel_summary <- analysis_data %>%
  left_join(celebrity_data %>% select(respondent_id, edu_college), by = "respondent_id") %>%
  group_by(edu_college) %>%
  summarise(
    mean_diff = mean(nationalism_salience, na.rm = TRUE),
    se        = sd(nationalism_salience, na.rm = TRUE) / sqrt(sum(!is.na(nationalism_salience))),
    n         = sum(!is.na(nationalism_salience)),
    .groups   = "drop"
  )

print(rel_summary)

# One-sample t-tests: does each subgroup differ significantly from 0?
t.test(nationalism_salience ~ 1, mu = 0,
       data = analysis_data %>%
         left_join(celebrity_data %>% select(respondent_id, edu_college), by = "respondent_id") %>%
         filter(edu_college == "College+"))

t.test(nationalism_salience ~ 1, mu = 0,
       data = analysis_data %>%
         left_join(celebrity_data %>% select(respondent_id, edu_college), by = "respondent_id") %>%
         filter(edu_college == "Non-college"))

# OLS test of subgroup difference
summary(lm(nationalism_salience ~ edu_college, data = analysis_data %>%
             left_join(celebrity_data %>% select(respondent_id, edu_college),
                       by = "respondent_id")))

# =============================================================================
# PART B: Weighted Analyses (post-stratification raked weights, ages 18–59)
# =============================================================================
# ── Weighted absolute severity ratings ───────────────────────────────────────
person_severity_means <- celebrity_data %>%
  filter(Age >= 18, Age <= 59) %>%
  pivot_longer(
    cols      = c(mis_japanshrine, mis_fashioncollab, mis_taiwan,
                  mis_drug, mis_tax, mis_drunkdrive, mis_prostitution, mis_cheat),
    names_to  = "misconduct_type",
    values_to = "severity_score"
  ) %>%
  mutate(
    misconduct_category = case_when(
      misconduct_type %in% c("mis_japanshrine", "mis_fashioncollab", "mis_taiwan")
      ~ "Nationalist Transgressions",
      misconduct_type %in% c("mis_drug", "mis_tax", "mis_drunkdrive",
                             "mis_prostitution", "mis_cheat")
      ~ "Nonpolitical Transgressions"
    )
  ) %>%
  filter(!is.na(misconduct_category)) %>%
  group_by(respondent_id, misconduct_category) %>%
  summarise(mean_severity = mean(severity_score, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = misconduct_category, values_from = mean_severity) %>%
  left_join(weights_df, by = "respondent_id") %>%
  filter(!is.na(`Nationalist Transgressions`) & !is.na(`Nonpolitical Transgressions`))

des_paired_severity <- svydesign(ids = ~1, data = person_severity_means, weights = ~weight)

# Weighted paired t-test: nationalist vs. nonpolitical severity
svyttest(`Nationalist Transgressions` - `Nonpolitical Transgressions` ~ 0,
         des_paired_severity)

# ── Weighted relative ranking ─────────────────────────────────────────────────
person_ranking_means <- celebrity_data_long_relative %>%
  select(respondent_id, misconduct_category, reversed_rank) %>%
  filter(respondent_id %in% weighted_data$respondent_id) %>%
  group_by(respondent_id, misconduct_category) %>%
  summarise(mean_rank = mean(reversed_rank, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = misconduct_category, values_from = mean_rank) %>%
  left_join(weights_df, by = "respondent_id")

des_paired_ranking <- svydesign(ids = ~1, data = person_ranking_means, weights = ~weight)

# Weighted paired t-test: nationalist vs. nonpolitical relative rank
svyttest(`Nationalist Transgressions` - `Non-political Transgressions` ~ 0,
         des_paired_ranking)

# ── Weighted regression models ────────────────────────────────────────────────
analysis_data_weighted <- analysis_data %>%
  left_join(weights_df, by = "respondent_id") %>%
  filter(!is.na(weight))

des_weighted <- svydesign(ids = ~1, data = analysis_data_weighted, weights = ~weight)

# Model 1: continuous SES
mod1_weighted <- svyglm(
  nationalism_salience ~
    Age_z + gender + region4 +
    income_log_z + edu_numeric_z +
    hukou_urban + party_status_member +
    entertain_news_n_z + entertain_spending_n_z +
    pol_knowledge_sum,
  design = des_weighted
)

# Model 2: categorical SES
mod2_weighted <- svyglm(
  nationalism_salience ~
    Age_z + gender + region4 +
    income_bracket + edu_3 +
    hukou_urban + party_status_member +
    entertain_news_n_z + entertain_spending_n_z +
    pol_knowledge_sum,
  design = des_weighted
)

summary(mod1_weighted)
summary(mod2_weighted)

# ── Export weighted regression tables (Appendix E) ───────────────────────────

modelsummary(
  list("Model 1: Continuous SES (weighted)"  = mod1_weighted,
       "Model 2: Categorical SES (weighted)" = mod2_weighted),
  coef_map = coef_labels,
  gof_map  = gof_map,
  stars    = TRUE,
  vcov     = list(vcovHC(mod1_weighted, type = "HC3"),
                  vcovHC(mod2_weighted, type = "HC3")),
  output   = "appendix_weighted_regressions.docx"
)
